package org.apache.solr.update.processor;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import org.apache.solr.common.params.SolrParams;

/**
 * <p>This implementation is copied from Apache Nutch. </p>
 * <p>An implementation of a page signature. It calculates an MD5 hash
 * of a plain text "profile" of a page.</p>
 * <p>The algorithm to calculate a page "profile" takes the plain text version of
 * a page and performs the following steps:
 * <ul>
 * <li>split the text into tokens, where a token is a maximal run of letters and
 * digits (every other character acts as a separator), and bring all characters
 * to lower case,</li>
 * <li>discard tokens equal or shorter than MIN_TOKEN_LEN (default 2 characters),</li>
 * <li>round down the counts of tokens to the nearest multiple of QUANT
 * (<code>QUANT = QUANT_RATE * maxFreq</code> rounded to the nearest integer, where
 * <code>QUANT_RATE</code> is 0.01f by default, and <code>maxFreq</code> is the
 * maximum token frequency). If <code>maxFreq</code> is higher than 1, then QUANT
 * is always at least 2 (which means that tokens with frequency 1 are always
 * discarded); otherwise QUANT is at least 1.</li>
 * <li>tokens, which frequency after quantization falls below QUANT, are discarded.</li>
 * <li>sort the remaining tokens by decreasing quantized frequency; tokens with the
 * same frequency are ordered by their text so that the result is deterministic,</li>
 * <li>create a list of tokens and their quantized frequency, one
 * <code>token frequency</code> pair (separated by a space) per line,
 * in the order established above.</li>
 * </ul>
 * This list is then submitted to an MD5 hash calculation.*/
public class TextProfileSignature extends MD5Signature {

    /** Multiplier applied to the maximum token frequency to obtain QUANT; set by {@link #init}. */
    private float quantRate;
    /** Tokens of this length or shorter are not counted; set by {@link #init}. */
    private int minTokenLen;

    /**
     * Reads the profile settings from the given parameters.
     *
     * @param params source of the <code>quantRate</code> (default 0.01f) and
     *               <code>minTokenLen</code> (default 2) settings
     */
    @Override
    public void init(SolrParams params) {
        quantRate = params.getFloat("quantRate", 0.01f);
        minTokenLen = params.getInt("minTokenLen", 2);
    }

    /**
     * Returns the signature computed by the superclass over everything passed
     * to it by {@link #add(String)}.
     */
    @Override
    public byte[] getSignature() {
        return super.getSignature();
    }

    /**
     * Builds the text profile of <code>content</code> and forwards the profile
     * (not the raw text) to the superclass. Each call profiles its own
     * <code>content</code> independently of earlier calls.
     *
     * @param content plain text to profile; must not be <code>null</code>
     */
    @Override
    public void add(String content) {

        HashMap<String, Token> tokens = new HashMap<>();

        StringBuilder curToken = new StringBuilder();
        int maxFreq = 0;
        for (int i = 0; i < content.length(); i++) {
            char c = content.charAt(i);
            if (Character.isLetterOrDigit(c)) {
                curToken.append(Character.toLowerCase(c));
            } else if (curToken.length() > 0) {
                // a separator ends the current token
                maxFreq = Math.max(maxFreq, countToken(tokens, curToken));
                curToken.setLength(0);
            }
        }
        // the text may end without a trailing separator: check the last token
        maxFreq = Math.max(maxFreq, countToken(tokens, curToken));

        // calculate the QUANT value
        int quant = Math.round(maxFreq * quantRate);
        if (quant < 2) {
            // never let QUANT drop to 0 (division below); use 2 as soon as any
            // token repeats so that tokens seen only once are dropped
            quant = (maxFreq > 1) ? 2 : 1;
        }

        ArrayList<Token> profile = new ArrayList<>();
        Iterator<Token> it = tokens.values().iterator();
        while (it.hasNext()) {
            Token t = it.next();
            // round down to the nearest QUANT
            t.cnt = (t.cnt / quant) * quant;
            // keep only the frequencies that did not fall below the QUANT
            if (t.cnt >= quant) {
                profile.add(t);
            }
        }

        Collections.sort(profile, new TokenComparator());

        StringBuilder newText = new StringBuilder();
        for (Token t : profile) {
            if (newText.length() > 0) {
                newText.append("\n");
            }
            newText.append(t.toString());
        }

        super.add(newText.toString());
    }

    /**
     * Counts one occurrence of the token held in <code>curToken</code>, unless
     * it is too short to be counted.
     *
     * @param tokens   per-token counters, updated in place
     * @param curToken the token text collected so far; not modified
     * @return the token's count after this occurrence, or 0 if the token is
     *         equal or shorter than <code>minTokenLen</code> and was ignored
     */
    private int countToken(HashMap<String, Token> tokens, StringBuilder curToken) {
        if (curToken.length() <= minTokenLen) {
            return 0;
        }
        String s = curToken.toString();
        Token tok = tokens.get(s);
        if (tok == null) {
            tok = new Token(0, s);
            tokens.put(s, tok);
        }
        tok.cnt++;
        return tok.cnt;
    }

    /** A token text together with its (raw, later quantized) occurrence count. */
    private static class Token {

        public int cnt;
        public String val;

        public Token(int cnt, String val) {
            this.cnt = cnt;
            this.val = val;
        }

        /** Renders the profile line for this token: text, a space, then the count. */
        @Override
        public String toString() {
            return val + " " + cnt;
        }
    }

    /**
     * Orders tokens by decreasing count, then by token text.
     * The secondary key matters: without it, tokens with equal counts would be
     * emitted in HashMap iteration order, which is not a defined order, so equal
     * profiles could yield different signatures.
     */
    private static class TokenComparator implements Comparator<Token> {

        @Override
        public int compare(Token t1, Token t2) {
            // Integer.compare instead of subtraction: no overflow pitfalls
            int byCount = Integer.compare(t2.cnt, t1.cnt);
            if (byCount != 0) {
                return byCount;
            }
            return t1.val.compareTo(t2.val);
        }
    }
}
